# coding: utf-8
# author: Han Zhang

import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

import sys
sys.path.insert(0, "../lib/")
from dependency import *
from LSTM_text_dependency import *
from sklearn.externals import joblib
from joblib import Parallel, delayed

from keras.models import load_model
from keras.models import model_from_json
from keras.preprocessing import sequence
from keras.preprocessing import text

from scipy import stats
import random
import numpy as np
import json
import codecs
import os

import warnings
import sys
warnings.filterwarnings('ignore')
np.seterr(all='raise')




# Set TensorFlow's native log-level variable before the Keras backend is
# imported on the next line, so it is in place when the backend loads.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
from keras import backend as K
# Select the 'tf' image dimension ordering in the Keras backend.
K.set_image_dim_ordering('tf')


import tensorflow as tf
# NOTE(review): a `global` statement at module scope has no effect; `graph`
# assigned below is already a module-level name.
global graph

# Keep a module-level reference to TensorFlow's default graph as of import.
# Presumably kept so callers can enter it with `graph.as_default()` - TODO confirm.
graph = tf.get_default_graph()


def readWordPosFromFile(filepath):
	"""Build a word -> line-index lookup from a UTF-8 text file.

	Each line contributes one entry: the key is the text before the first
	comma on that line and the value is the line's 0-based position in the
	file. If the same key appears on several lines, the last one wins.

	Args:
		filepath: path of the UTF-8 encoded file to read.

	Returns:
		dict mapping each line's leading field to its line index.
	"""
	# A context manager guarantees the handle is closed, even when decoding
	# fails part-way through; previously the file was never closed explicitly.
	with codecs.open(filepath, 'r', encoding="utf-8") as handle:
		lines = handle.readlines()
	# NOTE: a line that contains no comma keeps its trailing newline in the key.
	return {line.split(",")[0]: index for index, line in enumerate(lines)}


def getNormalTextString(eachline):
	"""Tokenise one pre-segmented line and drop unwanted tokens.

	The line is split on whitespace. Tokens are expected to look like
	``word_tag``; a token is dropped when the part before the first
	underscore is in the module-level ``stop`` collection.

	* If every remaining token carries a tag, only tokens whose tag is in
	  the module-level ``acceptlist`` are kept, and the bare word (without
	  the tag) is returned for each.
	* If any remaining token has no underscore, the whole line is treated
	  as untagged: the remaining tokens are returned unchanged, minus any
	  that are themselves in ``stop``.

	Args:
		eachline: whitespace-separated string of tokens.

	Returns:
		list of the kept tokens, in their original order.
	"""
	tokens = eachline.strip().split()
	# Stop-word filter on the word part (text before the first underscore).
	tokens = [x for x in tokens if x.split("_")[0] not in stop]
	try:
		# Keep only words with an accepted tag. An untagged token has no
		# second field, so indexing [1] raises IndexError for it.
		return [
			x.split("_")[0] for x in tokens if x.split("_")[1] in acceptlist
		]
	except IndexError:
		# Untagged input: fall back to a plain stop-word filter. Only
		# IndexError is caught so that unrelated failures (and Ctrl-C) are
		# no longer silently swallowed as they were by the bare `except:`.
		return [x for x in tokens if x not in stop]

def getWordSeq(words, vocab=None):
	"""Map token sequences to sequences of vocabulary values.

	Args:
		words: iterable of token sequences (e.g. a list of lists of words).
		vocab: optional mapping from token to value. When omitted, the
			module-level ``wordpos`` dictionary is used, which matches the
			previous behaviour of this function.

	Returns:
		list with one list per input sequence. Each token is replaced by its
		value in the mapping, or by 0 when the token is not in the mapping.
		Tokens that cannot be looked up at all (unhashable objects) are
		reported on stdout and skipped.
	"""
	if vocab is None:
		vocab = wordpos
	sequences = []
	for tokens in words:
		indexes = []
		for k in tokens:
			try:
				# Unknown tokens share the value 0.
				indexes.append(vocab.get(k, 0))
			except TypeError:
				# Unhashable token: report it and leave it out of the
				# sequence, as the original best-effort handler did.
				print(k.__class__, k)
		sequences.append(indexes)
	return sequences



######################################################################
######################## Read required model files and dictionaries  #########################
######################################################################


# with open("/Users/han/Codes/Jen Pan/predictEvent/vocab/vocab_pos_KGP50000.dict") as json_file:
# Resolve the supporting files relative to this source file.
dirname = os.path.dirname(__file__)
filename = os.path.join(dirname, '../supporting/vocab_pos_KGP_50000.dict')

# Vocabulary dictionary (JSON) that getWordSeq looks tokens up in.
with open(filename) as json_file:
	wordpos = json.load(json_file)



# Model architecture, stored as JSON. `with` closes the handle even if the
# read fails, unlike the previous manual open()/close() pair.
filename = os.path.join(dirname, '../modelfiles/text-stage1.json')
with open(filename, 'r') as json_file:
	loaded_model_json = json_file.read()


# Rebuild the model from its JSON description, then load the trained weights.
model = model_from_json(loaded_model_json)
filename = os.path.join(dirname, "../modelfiles/weights_text-stage1.h5")
model.load_weights(filename)
# Private Keras API. Presumably called here so the predict function is built
# once at import time, before any (threaded) caller uses it - TODO confirm.
model._make_predict_function()

def predict_text_deep(text):
	"""Score one pre-segmented text with the module-level stage-1 text model.

	The text is tokenised and filtered with getNormalTextString, mapped to
	vocabulary values with getWordSeq, padded to length 128 and passed to
	``model.predict`` as a batch of one.

	Args:
		text (str): whitespace-separated (pre-segmented) words to classify,
			or None.

	Returns:
		The first element of the flattened model output, i.e. the stage-1
		predicted probability that this text is discussing protests.
		Returns 0 when ``text`` is None.
	"""
	if text is None:
		return 0

	# The model expects a batch, so wrap the single token list in a list.
	tokens = getNormalTextString(text)
	X = np.array(getWordSeq([tokens]))
	X_test = sequence.pad_sequences(X, maxlen=128)

	# y holds the predicted probabilities for the one-item batch. The
	# predict function is built once at import time, so it is not rebuilt
	# after every call here.
	y = model.predict(X_test)

	return y.flatten()[0]

if __name__ == '__main__':
	# Smoke test on one pre-segmented sentence. A single-argument print(...)
	# and a u"" literal behave the same on Python 2 (this file declares
	# `coding: utf-8`) and Python 3, unlike the print statement and
	# str.decode() used before.
	print(predict_text_deep(u"山东 聊城 东昌府区 政府 采用 株连 公职人员 的 手段 促使 村民 签订 拆迁 协议 陈庄村 柳园 街道 陈庄村 四十余名 公职人员 因 亲属 没有 签订 拆迁 协议 而 遭 政府 通知 要 开除公职 另有 拆迁办 的 通告 称 不要 相信 律师 谗言 对多要 拆迁 补偿 抱有幻想"))

	# print Parallel(n_jobs = 4, backend="threading")(delayed(predict_text_deep)(x) for x in ["山东 聊城 东昌府区 政府 采用 株连 公职人员 的 手段 促使 村民 签订 拆迁 协议 陈庄村 柳园 街道 陈庄村 四十余名 公职人员 因 亲属 没有 签订 拆迁 协议 而 遭 政府 通知 要 开除公职 另有 拆迁办 的 通告 称 不要 相信 律师 谗言 对多要 拆迁 补偿 抱有幻想".decode('utf-8')])
